// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// includes, project
#include <cutil_inline.h>

// includes, kernels
#include <zad2_kernel.cu>

/**
 * CPU reference implementation used to validate the GPU result.
 *
 * Every element that has RADIUS neighbours on both sides is replaced by the
 * mean of the 2*RADIUS+1 inputs centred on it; elements closer than RADIUS
 * to either end are copied through unchanged. The elapsed CPU time is
 * printed to stdout.
 *
 * @param h_input  host array holding at least `count` floats
 * @param count    number of elements to process
 * @return newly malloc'ed array of `count` floats (release with free()),
 *         or NULL when `count` is not positive or the allocation fails
 */
float* computeGold(float* h_input, int count) {
	if(count <= 0)
		return NULL;

	float* res = (float*)malloc(sizeof(float) * (size_t)count);
	if(NULL == res)
		return NULL;

	// Width of the pass-through border, clamped so that inputs shorter
	// than RADIUS never index before the start of the arrays.
	const int border = (count < RADIUS) ? count : RADIUS;
	// Loop-invariant window size, hoisted out of the averaging loop.
	const float windowSize = 2 * RADIUS + 1.f;

	unsigned int cpuTimer = 0;
	CUT_SAFE_CALL(cutCreateTimer(&cpuTimer));
	CUT_SAFE_CALL(cutStartTimer(cpuTimer));

	// Borders: no full window is available, copy the input as-is.
	for(int i = 0; i < border; i++) {
		res[i] = h_input[i];
		res[count-i-1] = h_input[count-i-1];
	}
	// Interior: moving average over [i-RADIUS, i+RADIUS]. The loop is
	// empty when count <= 2*RADIUS, in which case the border copy above
	// has already filled every element.
	for(int i = RADIUS; i < count - RADIUS; i++) {
		float sum = 0.0f;
		for(int j = -RADIUS; j <= RADIUS; j++)
			sum += h_input[i+j];
		res[i] = sum / windowSize;
	}

	CUT_SAFE_CALL(cutStopTimer(cpuTimer));
	printf("CPU processing time: %f (ms)\n",
		cutGetTimerValue(cpuTimer));
	CUT_SAFE_CALL(cutDeleteTimer(cpuTimer));
	return res;
}

/**
 * Host driver.
 *
 * Fills a mapped page-locked host buffer with random values, runs
 * zad2Kernel on it through its device alias, copies the result back and
 * compares it with the CPU reference from computeGold(). Timings of each
 * stage are printed to stdout.
 *
 * @return 0 on completion, 1 when a host-side allocation fails
 */
int main( int argc, char** argv) {
	// Use the GPU given on the command line, otherwise the one with the
	// highest GFLOPS rating.
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
		cutilDeviceInit(argc, argv);
	else
		cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );
	// Required so that cudaHostAllocMapped memory can be mapped into the
	// device address space.
	cutilSafeCall( cudaSetDeviceFlags(cudaDeviceMapHost) );

	const int inputSize = 16*1024*1024; // max 31*... (original author's note)
	const size_t mem_size = sizeof(float) * (size_t)inputSize;

	float* h_input = 0;
	cutilSafeCall( cudaHostAlloc( (void**) &h_input,
		mem_size, cudaHostAllocMapped));
	if(0 == h_input) {
		printf("Za malo pamieci operacyjnej.\n");
		return 1;
	}
	srand((unsigned int)time(NULL));
	for(int i = 0; i < inputSize; ++i)
		h_input[i] = (float)rand();

	// The kernel reads the input directly from mapped host memory, so only
	// the output buffer is allocated on the device.
	float* d_input = 0;
	float* d_output = 0;
	cutilSafeCall( cudaHostGetDevicePointer((void**) &d_input,
		h_input, 0));
	cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));

	// No explicit host-to-device copy is performed; the timer is kept so
	// that the report and the total below retain the same layout.
	unsigned int gpuMemcpyHtDTimer = 0;
	CUT_SAFE_CALL(cutCreateTimer(&gpuMemcpyHtDTimer));
	CUT_SAFE_CALL(cutStartTimer(gpuMemcpyHtDTimer));
	CUT_SAFE_CALL(cutStopTimer(gpuMemcpyHtDTimer));
	printf("GPU Memcpy host to device time: %f (ms)\n",
		cutGetTimerValue(gpuMemcpyHtDTimer));

	// Grid and block dimensions.
	// NOTE(review): the kernel receives no element count, so this assumes
	// inputSize is a multiple of BLOCK_SIZE - confirm against the kernel.
	dim3  grid(inputSize / BLOCK_SIZE, 1, 1);
	dim3  threads(BLOCK_SIZE, 1, 1);

	unsigned int gpuProcessingTimer = 0;
	CUT_SAFE_CALL(cutCreateTimer(&gpuProcessingTimer));
	CUT_SAFE_CALL(cutStartTimer(gpuProcessingTimer));
	// Launch the kernel.
	zad2Kernel<<< grid, threads >>>( d_input, d_output);

	// Wait for completion so the timer covers the whole kernel run and
	// asynchronous execution errors are reported here.
	cutilSafeCall( cudaThreadSynchronize() );
	// Check that the kernel finished successfully.
	cutilCheckMsg("Kernel execution failed");

	CUT_SAFE_CALL(cutStopTimer(gpuProcessingTimer));
	printf("GPU processing time: %f (ms)\n",
		cutGetTimerValue(gpuProcessingTimer));

	float* h_output = 0;
	cutilSafeCall( cudaMallocHost( (void**) &h_output, mem_size));

	unsigned int gpuMemcpyDtHTimer = 0;
	CUT_SAFE_CALL(cutCreateTimer(&gpuMemcpyDtHTimer));
	CUT_SAFE_CALL(cutStartTimer(gpuMemcpyDtHTimer));
	// Copy the result from the GPU to host memory.
	cutilSafeCall( cudaMemcpy( h_output, d_output, mem_size,
								cudaMemcpyDeviceToHost) );

	CUT_SAFE_CALL(cutStopTimer(gpuMemcpyDtHTimer));
	printf("GPU Memcpy device to host time: %f (ms)\n",
		cutGetTimerValue(gpuMemcpyDtHTimer));
	printf("GPU total time: %f (ms)\n",
		cutGetTimerValue(gpuProcessingTimer)
		+ cutGetTimerValue(gpuMemcpyDtHTimer)
		+ cutGetTimerValue(gpuMemcpyHtDTimer));

	int status = 0;
	float* goldRes = computeGold(h_input, inputSize);
	if(0 == goldRes) {
		printf("CPU reference allocation failed.\n");
		status = 1;
	} else {
		// Inputs are as large as RAND_MAX, so a purely absolute tolerance
		// would demand bit-exact results; combine it with a relative term.
		const float absTol = 0.01f;
		const float relTol = 1e-5f;
		// Cap the per-element report so a broken kernel does not emit
		// millions of lines.
		const int maxReported = 100;
		int mismatches = 0;
		for(int i = 0; i < inputSize; ++i) {
			// fabsf avoids any chance of resolving to the integer abs().
			const float diff = fabsf(h_output[i] - goldRes[i]);
			const float tol = absTol + relTol * fabsf(goldRes[i]);
			// Negated comparison so that NaN counts as a mismatch.
			if(!(diff <= tol)) {
				if(mismatches < maxReported)
					printf("%d = %f != %f\n", i, goldRes[i], h_output[i]);
				++mismatches;
			}
		}
		if(mismatches > maxReported)
			printf("... and %d more mismatches not shown\n",
				mismatches - maxReported);
	}

	CUT_SAFE_CALL(cutDeleteTimer(gpuProcessingTimer));
	CUT_SAFE_CALL(cutDeleteTimer(gpuMemcpyHtDTimer));
	CUT_SAFE_CALL(cutDeleteTimer(gpuMemcpyDtHTimer));
	// Release memory. d_input is only an alias of h_input and is released
	// together with it.
	cutilSafeCall(cudaFreeHost(h_input));
	cutilSafeCall(cudaFreeHost(h_output));
	free(goldRes);
	cutilSafeCall(cudaFree(d_output));

	cudaThreadExit();
	cutilExit(argc, argv);
	return status;
}
